The main components of storytelling with data are:
Before diving into analyzing the data, it’s useful to research some background information. In this case, we know a few facts that will be useful:
import pandas
import numpy as np

# Load every NYC-Schools CSV into a dict keyed by the file's base name.
# ISO-8859-1 is used because some of the files are not valid UTF-8.
csv_names = ["ap_results", "class_size", "demographics", "graduation",
             "hs_directory", "math_test_results", "sat_results"]
data = {
    name: pandas.read_csv("./data/NYC-Schools/{0}.csv".format(name),
                          encoding="ISO-8859-1")
    for name in csv_names
}
# Preview the first rows of each dataset.
for name, frame in data.items():
    print("\n" + name + "\n")
    print(frame.head())
Initial thoughts:
All of the datasets have a DBN (unique code for each school) except for the class_size dataset. We will need to see if we can create this column there.
# What our DBN (unique school code) should look like.
data["demographics"]["DBN"].head()
data["class_size"].head()
# Construct the DBN for class_size: the two-digit, zero-padded CSD
# followed by the SCHOOL CODE.
def _make_dbn(row):
    return "{0:02d}{1}".format(row["CSD"], row["SCHOOL CODE"])

data["class_size"]["DBN"] = data["class_size"].apply(_make_dbn, axis=1)
data["class_size"]["DBN"].head()
# Read the two survey files (tab-separated, Windows-1252 encoded) and
# tag each row with whether it came from the district-75 file before
# stacking them into one frame.
survey_all = pandas.read_csv("./data/NYC-Schools/survey_all.txt",
                             delimiter="\t", encoding='windows-1252')
survey_d75 = pandas.read_csv("./data/NYC-Schools/survey_d75.txt",
                             delimiter="\t", encoding='windows-1252')
survey_all["d75"] = False
survey_d75["d75"] = True
survey = pandas.concat([survey_all, survey_d75], axis=0)
survey.head()
# Uppercase the DBN column to match the other datasets, then keep only
# the response-rate and score fields we need.
survey["DBN"] = survey["dbn"]
survey_fields = ["DBN", "rr_s", "rr_t", "rr_p", "N_s", "N_t", "N_p",
                 "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11",
                 "saf_t_11", "com_t_11", "eng_t_10", "aca_t_11",
                 "saf_s_11", "com_s_11", "eng_s_11", "aca_s_11",
                 "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11"]
survey = survey.loc[:, survey_fields]
data["survey"] = survey
survey.shape
The class_size dataset has multiple rows per school - for different grades and program types. We'll need to condense these in order to join them with the other datasets. To filter out the duplicates, we can:
class_size = data["class_size"]
# Keep only the high-school (grades 9-12) general-education rows.
# (The column name "GRADE " really does have a trailing space.)
class_size = class_size[(class_size["GRADE "] == "09-12") & \
                        (class_size["PROGRAM TYPE"] == "GEN ED")]
# Average the numeric columns per school so there is one row per DBN.
# numeric_only=True replaces the old agg(np.mean), which newer pandas
# rejects when non-numeric columns (e.g. school names) are present.
class_size = class_size.groupby("DBN").mean(numeric_only=True)
class_size.reset_index(inplace=True)
data["class_size"] = class_size
data["class_size"].head()
The demographics dataset also contains duplicates for different school years. We will filter to show only the 2011-2012 year.
# Demographics has one row per school per school year; keep only the
# 2011-2012 year so each school appears once.
demo = data["demographics"]
data["demographics"] = demo[demo["schoolyear"] == 20112012]
data["demographics"].head()
The math_test_results dataset contains multiple rows per school for different years and grades. We will filter to show 2011 scores for grade 8 only.
# Math results span several years and grades; keep 2011 / grade 8 only.
math_results = data["math_test_results"]
mask = (math_results["Year"] == 2011) & (math_results["Grade"] == '8')
data["math_test_results"] = math_results[mask]
data["math_test_results"].head()
The graduation dataset contains multiple rows per school for different cohorts and demographics. We will filter for 2006 and Total Cohort only.
# Graduation has one row per cohort and demographic group; keep the
# 2006 cohort's "Total Cohort" rows so each school appears once.
grad = data["graduation"]
grad = grad[(grad["Cohort"] == "2006") & (grad["Demographic"] == "Total Cohort")]
data["graduation"] = grad
data["graduation"].head()
We can compute a total SAT score from the individual columns SAT Math Avg. Score, SAT Critical Reading Avg. Score, and SAT Writing Avg. Score. In the below code, we:
data["sat_results"].head()
# The score columns are read in as strings; convert them to numbers.
data["sat_results"].info()
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
    # errors="coerce" turns non-numeric placeholders (these files use
    # 's' for suppressed values) into NaN instead of raising a
    # ValueError; the NaNs are filled with column means later on.
    data["sat_results"][c] = pandas.to_numeric(data["sat_results"][c], errors="coerce")
# Total SAT score is the sum of the three section averages.
data['sat_results']['sat_score'] = (data['sat_results'][cols[0]]
                                    + data['sat_results'][cols[1]]
                                    + data['sat_results'][cols[2]])
data["sat_results"].info()
We can parse out the coordinate locations of each school to plot the schools on a map.
# The last line of "Location 1" looks like "(lat, lon)"; strip the
# parentheses and split to recover each coordinate.
def _coordinate(loc, idx):
    pair = loc.split("\n")[-1].replace("(", "").replace(")", "")
    return pair.split(", ")[idx]

data["hs_directory"]['lat'] = data["hs_directory"]['Location 1'].apply(lambda loc: _coordinate(loc, 0))
data["hs_directory"]['lon'] = data["hs_directory"]['Location 1'].apply(lambda loc: _coordinate(loc, 1))
# Coordinates are still strings at this point; make them numeric.
for c in ['lat', 'lon']:
    data["hs_directory"][c] = pandas.to_numeric(data["hs_directory"][c])
data["hs_directory"].head()
http://pandas.pydata.org/pandas-docs/stable/merging.html
# Merge every dataset into one frame, joining on DBN.
flat_data_names = list(data.keys())
flat_data = [data[k] for k in flat_data_names]
full = flat_data[0]
for i, f in enumerate(flat_data[1:]):
    name = flat_data_names[i + 1]
    print(name)
    # How many duplicated DBNs does this dataset still contain?
    print(len(f["DBN"]) - len(f["DBN"].unique()))
    join_type = "inner"
    # Datasets that don't cover every school get an outer join so we
    # keep schools that are missing from them.
    # BUG FIX: the AP dataset is keyed "ap_results" (that's its file
    # name), not "ap_2010", so the outer join never applied to it.
    if name in ["sat_results", "ap_results", "graduation"]:
        join_type = "outer"
    if name not in ["math_test_results"]:
        full = full.merge(f, on="DBN", how=join_type)
full.shape
Notice that some rows in the AP results file contain the placeholder 's' (a suppressed value), which we will need to convert before treating the columns as numeric.
data["ap_results"].head()
# Transform the AP columns to numbers and fill in nulls.
cols = ['Num of AP Test Takers', 'Num of AP Total Exams Taken', 'Num of AP Exams Passed']
for col in cols:
    # The raw AP file marks suppressed counts with the string 's';
    # errors="coerce" converts those to NaN instead of raising.
    full[col] = pandas.to_numeric(full[col], errors="coerce")
# Treat missing/suppressed AP counts as zero.
full[cols] = full[cols].fillna(value=0)
Add a column for the school district, based on the first 2 digits of the DBN.
# The first two characters of a DBN identify the school district.
full["school_dist"] = full["DBN"].str[:2]
Fill in any missing values in full with the mean of the column, so we can compute correlations.
# Fill numeric gaps with column means so correlations are computable.
# numeric_only=True is required on newer pandas, where mean() and
# corr() no longer silently skip non-numeric columns.
full = full.fillna(full.mean(numeric_only=True))
full.corr(numeric_only=True)['sat_score']
This gives us quite a few insights that we’ll need to explore:
full.head()
import folium
from folium import plugins

# Plot every school as a clustered marker at its parsed coordinates,
# centred on the mean school location.
schools_map = folium.Map(location=[full['lat'].mean(), full['lon'].mean()], zoom_start=10)
# FIX: MarkerCluster lives in folium.plugins in current folium
# releases; folium.MarkerCluster was removed from the top-level module.
marker_cluster = plugins.MarkerCluster().add_to(schools_map)
for name, row in full.iterrows():
    folium.Marker([row["lat"], row["lon"]],
                  popup="{0}: {1}".format(row["DBN"], row["SCHOOL NAME_x"])).add_to(marker_cluster)
schools_map
A Heatmap might make it easier to see the schools.
# Render the same school locations as a heatmap instead of markers.
school_coords = [[row["lat"], row["lon"]] for _, row in full.iterrows()]
schools_heatmap = folium.Map(location=[full['lat'].mean(), full['lon'].mean()], zoom_start=10)
schools_heatmap.add_child(plugins.HeatMap(school_coords))
schools_heatmap
We can compute SAT score by school district, then plot this out on a map.
# Average every numeric column per district (numeric_only=True replaces
# the old agg(np.mean), which newer pandas rejects on mixed columns).
district_data = full.groupby("school_dist").mean(numeric_only=True)
district_data.reset_index(inplace=True)
# The geojson district ids carry no leading zero, so "01" -> "1".
district_data["school_dist"] = district_data["school_dist"].apply(lambda x: str(int(x)))
district_data.head()
# Choropleth of mean SAT score per school district.
geo_path = './data/NYC-Schools/districts.geojson'
districts = folium.Map(location=[full['lat'].mean(), full['lon'].mean()], zoom_start=10)
# NOTE(review): Map.choropleth(geo_path=...) is removed in newer folium;
# current releases use folium.Choropleth(geo_data=..., ...).add_to(map).
# Confirm the installed folium version before running.
districts.choropleth(geo_path=geo_path, data=district_data,
                     columns=['school_dist', 'sat_score'],
                     key_on='feature.properties.school_dist',
                     fill_color='YlGn', fill_opacity=0.7, line_opacity=0.7,
                     legend_name='SAT Score')
districts.save('districts.html')
districts
%matplotlib inline
# Scatter: does school size relate to SAT score?
full.plot.scatter(x='total_enrollment', y='sat_score')
# Inspect enrollment alongside school identifiers.
full[["DBN","SCHOOL NAME_x","total_enrollment"]]
# Rows where the directory's "School Name" is missing after the merge —
# presumably schools absent from hs_directory; verify against the join.
full[pandas.isnull(full["School Name"])][["SCHOOL NAME_x", "DBN", "School Name"]]
# Scatter: English-language-learner percentage vs SAT score.
full.plot.scatter(x='ell_percent', y='sat_score')